package wCount;

import java.util.*;
import java.util.Map.Entry;

public class WordDeal // 该类用于进行文件中的单词等处理
{

	String text; // 文件中内容
	private int charNum; // 字符个数
	private int wordCount; // 单词总数
	private int ValidLine; // 有效行数
	private Map<String, Integer> wordFreq; // 单词词频

	public WordDeal(String text) {
		this.text = text;
	}

	public int getCharCount() // 统计文件字符数
	{
		char c;
		int i = 0;
		while (i < text.length()) {
			c = text.charAt(i);
			if (c >= 32 && c <= 126 || c == '\r' || c == '\n' || c == '\t') {
				charNum++;
			}
			i++;
		}
		return charNum;
	}

	public int getWordCount() // 统计单词总数
	{
		String t = text;
		String[] spWord = t.split("\\s"); // 分词
		for (int i = 0; i < spWord.length; i++) {
			if (spWord[i].length() < 4) { // 判断长度是否大于等于4
				continue;
			} else {
				int flag = 1; // 判断字符串的前四位是否是英文字母
				char c;
				for (int j = 0; j < 4; j++) {
					c = spWord[i].charAt(j);
					if (!(c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z')) {
						flag = 0;
					}
				}
				if (flag == 1) {
					wordCount++;
				}
			}
		}
		return wordCount;
	}

	/*
	 * public Map getWordFreq() //
	 * 统计单词词频(单词：以4个英文字母开头，跟上字母数字符号，单词以分隔符分割，不区分大小写。) { wordFreq = new
	 * HashMap<String, Integer>(); String t = text;
	 * 
	 * String[] spWord = t.split("\\s"); // 对字符串进行分词操作 for (int i = 0; i <
	 * spWord.length; i++) { if (spWord[i].length() < 4) { // 判断长度是否大于等于4
	 * continue; } else {
	 * 
	 * int flag = 1; // 判断字符串的前四位是否是英文字母 char c;
	 * 
	 * for (int j = 0; j < 4; j++) { c = spWord[i].charAt(j);
	 * 
	 * if (!(c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z')) { flag = 0; } } if
	 * (flag == 1) { // 将字符串转化为小写 spWord[i] = spWord[i].trim().toLowerCase(); if
	 * (wordFreq.get(spWord[i]) == null) { // 判断之前Map中是否出现过该字符串
	 * wordFreq.put(spWord[i], 1); } else wordFreq.put(spWord[i],
	 * wordFreq.get(spWord[i]) + 1);
	 * 
	 * } } } return wordFreq; }
	 */

	public List getWordFreq() { // 对单词词频的Map进行排序

		wordFreq = new HashMap<String, Integer>();
		String t = text;

		String[] spWord = t.split("\\s"); // 分词
		for (int i = 0; i < spWord.length; i++) {
			if (spWord[i].length() < 4) {
				continue;
			} else {

				int flag = 1;
				char c;

				for (int j = 0; j < 4; j++) {
					c = spWord[i].charAt(j);

					if (!(c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z')) {
						flag = 0;
					}
				}
				if (flag == 1) {
					spWord[i] = spWord[i].trim().toLowerCase();
					if (wordFreq.get(spWord[i]) == null) {
						wordFreq.put(spWord[i], 1);
					} else
						wordFreq.put(spWord[i], wordFreq.get(spWord[i]) + 1);

				}
			}
		}

		List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(wordFreq.entrySet());
		Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {

			@Override
			public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { // 对Map中内容进行排序，先按词频后按字典顺序
				if (o1.getValue() == o2.getValue()) {
					return o1.getKey().compareTo(o2.getKey());
				}
				return o2.getValue() - o1.getValue();
			}

		});
		return list;
	}

	public int getLineCount() { // 统计有效行数

		String[] line = text.split("\r\n"); // 将每一行分开放入一个字符串数组
		for (int i = 0; i < line.length; i++) { // 找出无效行，统计有效行

			if (line[i].trim().length() == 0)
				continue;
			ValidLine = ValidLine + 1;
		}
		return ValidLine;
	}

	public String[] ListToArray(List<Map.Entry<String, Integer>> list) { // 将排完序的List元素筛选出前十个并存入数组
		String[] arr;
		int i = 0;
		int len = list.size();
		if (len <= 10) {
			arr = new String[len];
			for (Map.Entry<String, Integer> m : list) {
				arr[i] = "<" + m.getKey() + ">:" + m.getValue();
				i++;
			}
		} else {
			arr = new String[10];
			for (Map.Entry<String, Integer> m : list) {
				if (i == 10)
					break;
				arr[i] = "<" + m.getKey() + ">:" + m.getValue();
				i++;
			}
		}
		return arr;
	}

}
